library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)2015 AAQoL: Machine Learning Approaches
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
There are 2,609 responses, some with missing data.
Machine Learning Classifications
We are going to analyze the prediction accuracy of different machine learning algorithms in predicting the source of health information, healthcare utilization outcomes, and insurance.
We will consider the following algorithms: - Random Forest - Logistic Regression
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(Family~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = Family ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 42.96%
Confusion matrix:
No Yes class.error
No 423 367 0.4645570
Yes 329 501 0.3963855
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 718 52
Yes 72 778
Accuracy : 0.9235
95% CI : (0.9094, 0.9359)
No Information Rate : 0.5123
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.8467
Mcnemar's Test P-Value : 0.08796
Sensitivity : 0.9373
Specificity : 0.9089
Pos Pred Value : 0.9153
Neg Pred Value : 0.9325
Prevalence : 0.5123
Detection Rate : 0.4802
Detection Prevalence : 0.5247
Balanced Accuracy : 0.9231
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 176 127
Yes 178 224
Accuracy : 0.5674
95% CI : (0.5299, 0.6043)
No Information Rate : 0.5021
P-Value [Acc > NIR] : 0.0002993
Kappa : 0.1353
Mcnemar's Test P-Value : 0.0041966
Sensitivity : 0.6382
Specificity : 0.4972
Pos Pred Value : 0.5572
Neg Pred Value : 0.5809
Prevalence : 0.4979
Detection Rate : 0.3177
Detection Prevalence : 0.5702
Balanced Accuracy : 0.5677
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$Family,pred_noeth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
set.seed(222)
ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
train <- rfdata[ind==1,]
test <- rfdata[ind==2,]
randomForest::randomForest(Family~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = Family ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 40.56%
Confusion matrix:
No Yes class.error
No 444 351 0.4415094
Yes 306 519 0.3709091
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 722 31
Yes 73 794
Accuracy : 0.9358
95% CI : (0.9227, 0.9472)
No Information Rate : 0.5093
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8714
Mcnemar's Test P-Value : 5.81e-05
Sensitivity : 0.9624
Specificity : 0.9082
Pos Pred Value : 0.9158
Neg Pred Value : 0.9588
Prevalence : 0.5093
Detection Rate : 0.4901
Detection Prevalence : 0.5352
Balanced Accuracy : 0.9353
'Positive' Class : Yes
Test Set
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 192 140
Yes 157 216
Accuracy : 0.5787
95% CI : (0.5413, 0.6155)
No Information Rate : 0.505
P-Value [Acc > NIR] : 5.061e-05
Kappa : 0.157
Mcnemar's Test P-Value : 0.3532
Sensitivity : 0.6067
Specificity : 0.5501
Pos Pred Value : 0.5791
Neg Pred Value : 0.5783
Prevalence : 0.5050
Detection Rate : 0.3064
Detection Prevalence : 0.5291
Balanced Accuracy : 0.5784
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$Family,pred_eth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6283
pROC::auc(rocobj_wo)Area under the curve: 0.6175
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) No Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 7.918278 10.8645493 15.652593 66.03128
Age 16.654016 15.7460769 26.458185 170.45644
Gender 2.073286 11.5973472 10.273489 25.95290
Religion 4.617839 8.2180796 10.526109 72.95519
Employment 5.919339 3.9165010 8.723458 21.20784
Income 12.917360 0.6092244 10.286733 99.88059
EnglishSpeak 9.143444 8.3621486 15.168532 50.20068
EnglishDiff 12.539277 4.4662574 12.990193 57.47472
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(Family~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 460 346
Yes 335 479
Accuracy : 0.5796
95% CI : (0.5552, 0.6038)
No Information Rate : 0.5093
P-Value [Acc > NIR] : 7.809e-09
Kappa : 0.1592
Mcnemar's Test P-Value : 0.7016
Sensitivity : 0.5806
Specificity : 0.5786
Pos Pred Value : 0.5885
Neg Pred Value : 0.5707
Prevalence : 0.5093
Detection Rate : 0.2957
Detection Prevalence : 0.5025
Balanced Accuracy : 0.5796
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 194 156
Yes 155 200
Accuracy : 0.5589
95% CI : (0.5213, 0.5959)
No Information Rate : 0.505
P-Value [Acc > NIR] : 0.002342
Kappa : 0.1177
Mcnemar's Test P-Value : 1.000000
Sensitivity : 0.5618
Specificity : 0.5559
Pos Pred Value : 0.5634
Neg Pred Value : 0.5543
Prevalence : 0.5050
Detection Rate : 0.2837
Detection Prevalence : 0.5035
Balanced Accuracy : 0.5588
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$Family,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.5862
With ethnicity
Training Set
mod1w <- glm(Family~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1w,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 488 308
Yes 307 517
Accuracy : 0.6204
95% CI : (0.5962, 0.6441)
No Information Rate : 0.5093
P-Value [Acc > NIR] : <2e-16
Kappa : 0.2405
Mcnemar's Test P-Value : 1
Sensitivity : 0.6267
Specificity : 0.6138
Pos Pred Value : 0.6274
Neg Pred Value : 0.6131
Prevalence : 0.5093
Detection Rate : 0.3191
Detection Prevalence : 0.5086
Balanced Accuracy : 0.6203
'Positive' Class : Yes
Test Set
predict(mod1w,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 211 145
Yes 138 211
Accuracy : 0.5986
95% CI : (0.5613, 0.635)
No Information Rate : 0.505
P-Value [Acc > NIR] : 3.696e-07
Kappa : 0.1972
Mcnemar's Test P-Value : 0.7213
Sensitivity : 0.5927
Specificity : 0.6046
Pos Pred Value : 0.6046
Neg Pred Value : 0.5927
Prevalence : 0.5050
Detection Rate : 0.2993
Detection Prevalence : 0.4950
Balanced Accuracy : 0.5986
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$Family,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.6195
Logistic Regression (ANOVA)
glm(Family~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(Family~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = Family ~ Age + Ethnicity + Gender + Religion +
Income + Employment + EnglishSpeak + EnglishDiff, family = binomial,
data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.663761 0.332233 5.008 5.51e-07 ***
Age -0.015172 0.002895 -5.241 1.60e-07 ***
EthnicityAsian Indian -0.283323 0.259350 -1.092 0.274643
EthnicityFilipino -0.209576 0.197124 -1.063 0.287707
EthnicityKorean -0.619568 0.147716 -4.194 2.74e-05 ***
EthnicityOther -0.774617 0.218603 -3.543 0.000395 ***
EthnicityVietnamese -0.758603 0.156671 -4.842 1.29e-06 ***
GenderMale 0.278524 0.090508 3.077 0.002089 **
ReligionCatholic -0.009392 0.166826 -0.056 0.955104
ReligionHindu -0.255708 0.279422 -0.915 0.360122
ReligionMuslim -0.203595 0.335527 -0.607 0.543989
ReligionNone -0.190725 0.169115 -1.128 0.259413
ReligionOther -0.599988 0.359218 -1.670 0.094867 .
ReligionProtestant -0.082695 0.169427 -0.488 0.625488
Income$10,000 - $19,999 -0.293073 0.204786 -1.431 0.152397
Income$20,000 - $29,999 -0.355893 0.208565 -1.706 0.087936 .
Income$30,000 - $39,999 -0.410814 0.204680 -2.007 0.044739 *
Income$40,000 - $49,999 -0.498557 0.216107 -2.307 0.021056 *
Income$50,000 - $59,999 -0.262414 0.213631 -1.228 0.219315
Income$60,000 - $69,999 -0.516755 0.209767 -2.463 0.013760 *
Income$70,000 and over -0.449772 0.163777 -2.746 0.006028 **
EmploymentEmployed full time -0.372539 0.096073 -3.878 0.000105 ***
EnglishSpeakNot well -0.137994 0.223937 -0.616 0.537750
EnglishSpeakVery well 0.009748 0.236859 0.041 0.967174
EnglishSpeakWell -0.214154 0.224791 -0.953 0.340752
EnglishDiffMuch 0.390928 0.143654 2.721 0.006502 **
EnglishDiffNot much 0.016162 0.133783 0.121 0.903841
EnglishDiffVery much -0.245538 0.128476 -1.911 0.055984 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3222.5 on 2324 degrees of freedom
Residual deviance: 3090.4 on 2297 degrees of freedom
AIC: 3146.4
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Family
LR Chisq Df Pr(>Chisq)
Age 27.803 1 1.343e-07 ***
Ethnicity 38.633 5 2.815e-07 ***
Gender 9.521 1 0.0020313 **
Religion 4.354 6 0.6289341
Income 9.873 7 0.1958741
Employment 15.115 1 0.0001011 ***
EnglishSpeak 3.876 3 0.2751648
EnglishDiff 19.812 3 0.0001857 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.220700 1 1.104853
Ethnicity 15.005880 5 1.311071
Gender 1.118742 1 1.057706
Religion 13.250791 6 1.240281
Income 1.462715 7 1.027536
Employment 1.262994 1 1.123830
EnglishSpeak 2.419635 3 1.158666
EnglishDiff 1.827597 3 1.105724
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: Family ~ Age + Gender + Religion + Income + Employment + EnglishSpeak +
EnglishDiff
Model 2: Family ~ Age + Ethnicity + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2302 3129.0
2 2297 3090.4 5 38.633 2.815e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 3174.988 3146.356
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 3307.272 3307.397
Source of Information: Health Professional
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
set.seed(222)
pos<- rfdata |> filter(`Heal Professionals`=="Yes")
neg <- rfdata |> filter(`Heal Professionals`=="No")
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Heal Professionals`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Heal Professionals` ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 41.23%
Confusion matrix:
No Yes class.error
No 428 362 0.4582278
Yes 306 524 0.3686747
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 677 46
Yes 113 784
Accuracy : 0.9019
95% CI : (0.8863, 0.9159)
No Information Rate : 0.5123
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8032
Mcnemar's Test P-Value : 1.658e-07
Sensitivity : 0.9446
Specificity : 0.8570
Pos Pred Value : 0.8740
Neg Pred Value : 0.9364
Prevalence : 0.5123
Detection Rate : 0.4840
Detection Prevalence : 0.5537
Balanced Accuracy : 0.9008
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 181 126
Yes 174 225
Accuracy : 0.5751
95% CI : (0.5376, 0.6119)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 6.996e-05
Kappa : 0.1508
Mcnemar's Test P-Value : 0.006657
Sensitivity : 0.6410
Specificity : 0.5099
Pos Pred Value : 0.5639
Neg Pred Value : 0.5896
Prevalence : 0.4972
Detection Rate : 0.3187
Detection Prevalence : 0.5652
Balanced Accuracy : 0.5754
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Heal Professionals`,pred_noeth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Heal Professionals`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Heal Professionals` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 42.1%
Confusion matrix:
No Yes class.error
No 409 381 0.4822785
Yes 301 529 0.3626506
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 704 43
Yes 86 787
Accuracy : 0.9204
95% CI : (0.9061, 0.9331)
No Information Rate : 0.5123
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8404
Mcnemar's Test P-Value : 0.0002174
Sensitivity : 0.9482
Specificity : 0.8911
Pos Pred Value : 0.9015
Neg Pred Value : 0.9424
Prevalence : 0.5123
Detection Rate : 0.4858
Detection Prevalence : 0.5389
Balanced Accuracy : 0.9197
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 179 122
Yes 176 229
Accuracy : 0.5779
95% CI : (0.5405, 0.6147)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 3.752e-05
Kappa : 0.1565
Mcnemar's Test P-Value : 0.002139
Sensitivity : 0.6524
Specificity : 0.5042
Pos Pred Value : 0.5654
Neg Pred Value : 0.5947
Prevalence : 0.4972
Detection Rate : 0.3244
Detection Prevalence : 0.5737
Balanced Accuracy : 0.5783
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Heal Professionals`,pred_eth[,2])Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6269
pROC::auc(rocobj_wo)Area under the curve: 0.6207
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) No Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 5.9860318 9.60330404 13.049013 68.63717
Age 7.0329957 4.75500036 10.080950 154.94856
Gender 5.3711218 -2.95635281 2.138476 25.31325
Religion 9.7787112 -0.02549784 7.782594 72.44490
Employment -1.5342540 -1.77234860 -2.779661 20.60935
Income -0.7468972 15.76511423 11.391666 100.79240
EnglishSpeak 7.2331614 19.06777183 22.152922 53.80531
EnglishDiff 3.7127127 9.91157749 10.384253 58.43659
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Heal Professionals`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 484 289
Yes 306 541
Accuracy : 0.6327
95% CI : (0.6087, 0.6562)
No Information Rate : 0.5123
P-Value [Acc > NIR] : <2e-16
Kappa : 0.2646
Mcnemar's Test P-Value : 0.5119
Sensitivity : 0.6518
Specificity : 0.6127
Pos Pred Value : 0.6387
Neg Pred Value : 0.6261
Prevalence : 0.5123
Detection Rate : 0.3340
Detection Prevalence : 0.5228
Balanced Accuracy : 0.6322
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 198 123
Yes 157 228
Accuracy : 0.6034
95% CI : (0.5662, 0.6397)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 5.022e-08
Kappa : 0.2072
Mcnemar's Test P-Value : 0.0486
Sensitivity : 0.6496
Specificity : 0.5577
Pos Pred Value : 0.5922
Neg Pred Value : 0.6168
Prevalence : 0.4972
Detection Rate : 0.3229
Detection Prevalence : 0.5453
Balanced Accuracy : 0.6037
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Heal Professionals`,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6429
With ethnicity
Training Set
mod1 <- glm(`Heal Professionals`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 485 294
Yes 305 536
Accuracy : 0.6302
95% CI : (0.6062, 0.6538)
No Information Rate : 0.5123
P-Value [Acc > NIR] : <2e-16
Kappa : 0.2598
Mcnemar's Test P-Value : 0.6828
Sensitivity : 0.6458
Specificity : 0.6139
Pos Pred Value : 0.6373
Neg Pred Value : 0.6226
Prevalence : 0.5123
Detection Rate : 0.3309
Detection Prevalence : 0.5191
Balanced Accuracy : 0.6299
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 203 130
Yes 152 221
Accuracy : 0.6006
95% CI : (0.5634, 0.6369)
No Information Rate : 0.5028
P-Value [Acc > NIR] : 1.147e-07
Kappa : 0.2014
Mcnemar's Test P-Value : 0.2111
Sensitivity : 0.6296
Specificity : 0.5718
Pos Pred Value : 0.5925
Neg Pred Value : 0.6096
Prevalence : 0.4972
Detection Rate : 0.3130
Detection Prevalence : 0.5283
Balanced Accuracy : 0.6007
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Heal Professionals`,predicted_probs)Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.6418
Logistic Regression (ANOVA)
glm(`Heal Professionals`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Heal Professionals`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Heal Professionals` ~ Age + Ethnicity + Gender +
Religion + Income + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.072165 0.340255 -3.151 0.00163 **
Age 0.012777 0.002978 4.291 1.78e-05 ***
EthnicityAsian Indian -0.341146 0.263054 -1.297 0.19468
EthnicityFilipino 0.322119 0.207063 1.556 0.11979
EthnicityKorean -0.426939 0.150828 -2.831 0.00465 **
EthnicityOther -0.140765 0.220803 -0.638 0.52379
EthnicityVietnamese 0.024371 0.158357 0.154 0.87769
GenderMale 0.001829 0.092204 0.020 0.98417
ReligionCatholic -0.251763 0.169865 -1.482 0.13831
ReligionHindu -0.601409 0.282815 -2.127 0.03346 *
ReligionMuslim 0.144213 0.342914 0.421 0.67408
ReligionNone -0.348254 0.171327 -2.033 0.04208 *
ReligionOther -0.002626 0.363756 -0.007 0.99424
ReligionProtestant -0.288234 0.171901 -1.677 0.09359 .
Income$10,000 - $19,999 -0.069905 0.206716 -0.338 0.73524
Income$20,000 - $29,999 0.087219 0.208163 0.419 0.67522
Income$30,000 - $39,999 -0.022090 0.206085 -0.107 0.91464
Income$40,000 - $49,999 -0.003825 0.217525 -0.018 0.98597
Income$50,000 - $59,999 0.275124 0.215593 1.276 0.20191
Income$60,000 - $69,999 0.068545 0.212114 0.323 0.74658
Income$70,000 and over 0.471937 0.164103 2.876 0.00403 **
EmploymentEmployed full time -0.007485 0.098315 -0.076 0.93931
EnglishSpeakNot well 0.460299 0.236839 1.944 0.05195 .
EnglishSpeakVery well 1.459027 0.250189 5.832 5.49e-09 ***
EnglishSpeakWell 1.013225 0.237330 4.269 1.96e-05 ***
EnglishDiffMuch -0.369009 0.145860 -2.530 0.01141 *
EnglishDiffNot much -0.306210 0.136116 -2.250 0.02447 *
EnglishDiffVery much -0.332534 0.131642 -2.526 0.01154 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3224.0 on 2325 degrees of freedom
Residual deviance: 2984.2 on 2298 degrees of freedom
AIC: 3040.2
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Heal Professionals
LR Chisq Df Pr(>Chisq)
Age 18.618 1 1.597e-05 ***
Ethnicity 18.242 5 0.002658 **
Gender 0.000 1 0.984173
Religion 11.608 6 0.071317 .
Income 22.002 7 0.002538 **
Employment 0.006 1 0.939307
EnglishSpeak 55.796 3 4.644e-12 ***
EnglishDiff 9.309 3 0.025447 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.246279 1 1.116369
Ethnicity 14.198423 5 1.303839
Gender 1.108640 1 1.052920
Religion 12.767229 6 1.236445
Income 1.460980 7 1.027449
Employment 1.260945 1 1.122918
EnglishSpeak 2.366930 3 1.154421
EnglishDiff 1.776456 3 1.100506
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Heal Professionals` ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: `Heal Professionals` ~ Age + Ethnicity + Gender + Religion +
Income + Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2303 3002.5
2 2298 2984.2 5 18.242 0.002658 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 3048.475 3040.234
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 3180.769 3201.287
Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Health Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Health Insurance`=="Yes")
neg <- rfdata |> filter(`Health Insurance`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Health Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Health Insurance` ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 13.58%
Confusion matrix:
0 Yes class.error
0 16 203 0.92694064
Yes 17 1384 0.01213419
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 132 2
Yes 87 1399
Accuracy : 0.9451
95% CI : (0.9328, 0.9557)
No Information Rate : 0.8648
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.719
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9986
Specificity : 0.6027
Pos Pred Value : 0.9415
Neg Pred Value : 0.9851
Prevalence : 0.8648
Detection Rate : 0.8636
Detection Prevalence : 0.9173
Balanced Accuracy : 0.8007
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 7 16
Yes 92 591
Accuracy : 0.847
95% CI : (0.8183, 0.8728)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.8483
Kappa : 0.0653
Mcnemar's Test P-Value : 5.319e-13
Sensitivity : 0.97364
Specificity : 0.07071
Pos Pred Value : 0.86530
Neg Pred Value : 0.30435
Prevalence : 0.85977
Detection Rate : 0.83711
Detection Prevalence : 0.96742
Balanced Accuracy : 0.52217
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Health Insurance`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Health Insurance`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Health Insurance` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 13.46%
Confusion matrix:
0 Yes class.error
0 18 201 0.91780822
Yes 17 1384 0.01213419
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 156 2
Yes 63 1399
Accuracy : 0.9599
95% CI : (0.9491, 0.9689)
No Information Rate : 0.8648
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8056
Mcnemar's Test P-Value : 9.911e-14
Sensitivity : 0.9986
Specificity : 0.7123
Pos Pred Value : 0.9569
Neg Pred Value : 0.9873
Prevalence : 0.8648
Detection Rate : 0.8636
Detection Prevalence : 0.9025
Balanced Accuracy : 0.8555
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 10 12
Yes 89 595
Accuracy : 0.8569
95% CI : (0.8289, 0.8819)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.6114
Kappa : 0.1204
Mcnemar's Test P-Value : 3.961e-14
Sensitivity : 0.9802
Specificity : 0.1010
Pos Pred Value : 0.8699
Neg Pred Value : 0.4545
Prevalence : 0.8598
Detection Rate : 0.8428
Detection Prevalence : 0.9688
Balanced Accuracy : 0.5406
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Health Insurance`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6666
pROC::auc(rocobj_wo)Area under the curve: 0.6534
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 2.400857 15.368673 16.107945 33.73518
Age 5.864815 11.132723 13.570007 72.90367
Gender -6.129335 7.159769 3.938031 12.15898
Religion 3.565750 12.763937 13.211563 36.02166
Employment 4.416671 13.315128 14.557834 11.33611
Income 19.851050 11.218608 18.695617 61.88486
EnglishSpeak 21.295650 13.206903 20.516757 31.91132
EnglishDiff -3.356981 9.806920 7.919856 26.51002
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Health Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 17 14
Yes 202 1387
Accuracy : 0.8667
95% CI : (0.8491, 0.8829)
No Information Rate : 0.8648
P-Value [Acc > NIR] : 0.4313
Kappa : 0.106
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.99001
Specificity : 0.07763
Pos Pred Value : 0.87288
Neg Pred Value : 0.54839
Prevalence : 0.86481
Detection Rate : 0.85617
Detection Prevalence : 0.98086
Balanced Accuracy : 0.53382
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 12 7
Yes 87 600
Accuracy : 0.8669
95% CI : (0.8396, 0.891)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.3164
Kappa : 0.1657
Mcnemar's Test P-Value : 3.693e-16
Sensitivity : 0.9885
Specificity : 0.1212
Pos Pred Value : 0.8734
Neg Pred Value : 0.6316
Prevalence : 0.8598
Detection Rate : 0.8499
Detection Prevalence : 0.9731
Balanced Accuracy : 0.5548
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Health Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6906
With ethnicity
Training Set
mod1 <- glm(`Health Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 20 13
Yes 199 1388
Accuracy : 0.8691
95% CI : (0.8517, 0.8852)
No Information Rate : 0.8648
P-Value [Acc > NIR] : 0.3208
Kappa : 0.1279
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.99072
Specificity : 0.09132
Pos Pred Value : 0.87461
Neg Pred Value : 0.60606
Prevalence : 0.86481
Detection Rate : 0.85679
Detection Prevalence : 0.97963
Balanced Accuracy : 0.54102
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 9 10
Yes 90 597
Accuracy : 0.8584
95% CI : (0.8304, 0.8832)
No Information Rate : 0.8598
P-Value [Acc > NIR] : 0.5695
Kappa : 0.1125
Mcnemar's Test P-Value : 2.789e-15
Sensitivity : 0.98353
Specificity : 0.09091
Pos Pred Value : 0.86900
Neg Pred Value : 0.47368
Prevalence : 0.85977
Detection Rate : 0.84561
Detection Prevalence : 0.97309
Balanced Accuracy : 0.53722
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Health Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.6863
Logistic Regression (ANOVA)
glm(`Health Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Health Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Health Insurance` ~ Age + Ethnicity + Gender +
Religion + Income + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.759266 0.413926 -1.834 0.06661 .
Age 0.010922 0.004052 2.696 0.00702 **
EthnicityAsian Indian -0.446479 0.408123 -1.094 0.27396
EthnicityFilipino -0.438511 0.317948 -1.379 0.16784
EthnicityKorean -0.620004 0.221913 -2.794 0.00521 **
EthnicityOther -0.789804 0.310543 -2.543 0.01098 *
EthnicityVietnamese -0.307298 0.232873 -1.320 0.18697
GenderMale -0.111812 0.134537 -0.831 0.40593
ReligionCatholic 0.411070 0.250368 1.642 0.10062
ReligionHindu 0.179499 0.431971 0.416 0.67775
ReligionMuslim -0.074638 0.455515 -0.164 0.86985
ReligionNone 0.171705 0.244416 0.703 0.48236
ReligionOther -0.905441 0.420837 -2.152 0.03143 *
ReligionProtestant 0.060117 0.246737 0.244 0.80750
Income$10,000 - $19,999 0.292500 0.233643 1.252 0.21060
Income$20,000 - $29,999 0.366639 0.246194 1.489 0.13643
Income$30,000 - $39,999 0.561284 0.251517 2.232 0.02564 *
Income$40,000 - $49,999 0.248941 0.262933 0.947 0.34375
Income$50,000 - $59,999 0.480334 0.263582 1.822 0.06840 .
Income$60,000 - $69,999 0.810553 0.282193 2.872 0.00407 **
Income$70,000 and over 1.698275 0.225733 7.523 5.34e-14 ***
EmploymentEmployed full time 0.746472 0.156473 4.771 1.84e-06 ***
EnglishSpeakNot well 1.427119 0.251087 5.684 1.32e-08 ***
EnglishSpeakVery well 1.842352 0.288990 6.375 1.83e-10 ***
EnglishSpeakWell 1.534621 0.257461 5.961 2.51e-09 ***
EnglishDiffMuch 0.024480 0.221104 0.111 0.91184
EnglishDiffNot much -0.020770 0.209162 -0.099 0.92090
EnglishDiffVery much -0.236057 0.205835 -1.147 0.25145
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1855.9 on 2325 degrees of freedom
Residual deviance: 1603.7 on 2298 degrees of freedom
AIC: 1659.7
Number of Fisher Scoring iterations: 5
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Health Insurance
LR Chisq Df Pr(>Chisq)
Age 7.426 1 0.006428 **
Ethnicity 11.131 5 0.048836 *
Gender 0.690 1 0.406267
Religion 10.941 6 0.090209 .
Income 78.427 7 2.881e-14 ***
Employment 23.838 1 1.048e-06 ***
EnglishSpeak 44.575 3 1.139e-09 ***
EnglishDiff 1.902 3 0.592916
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.322065 1 1.149811
Ethnicity 14.415734 5 1.305821
Gender 1.075038 1 1.036840
Religion 12.092494 6 1.230863
Income 1.434457 7 1.026105
Employment 1.204822 1 1.097644
EnglishSpeak 2.640364 3 1.175648
EnglishDiff 1.842258 3 1.107197
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Health Insurance` ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: `Health Insurance` ~ Age + Ethnicity + Gender + Religion + Income +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2303 1614.9
2 2298 1603.8 5 11.131 0.04884 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 1660.878 1659.747
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 1793.172 1820.8
Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Dental Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Dental Insurance`=="Yes")
neg <- rfdata |> filter(`Dental Insurance`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Dental Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Dental Insurance` ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 26.89%
Confusion matrix:
0 Yes class.error
0 384 254 0.3981191
Yes 181 799 0.1846939
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 543 32
Yes 95 948
Accuracy : 0.9215
95% CI : (0.9073, 0.9341)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8328
Mcnemar's Test P-Value : 3.763e-08
Sensitivity : 0.9673
Specificity : 0.8511
Pos Pred Value : 0.9089
Neg Pred Value : 0.9443
Prevalence : 0.6057
Detection Rate : 0.5859
Detection Prevalence : 0.6446
Balanced Accuracy : 0.9092
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 167 79
Yes 110 347
Accuracy : 0.7312
95% CI : (0.6967, 0.7636)
No Information Rate : 0.606
P-Value [Acc > NIR] : 2.214e-12
Kappa : 0.4258
Mcnemar's Test P-Value : 0.0291
Sensitivity : 0.8146
Specificity : 0.6029
Pos Pred Value : 0.7593
Neg Pred Value : 0.6789
Prevalence : 0.6060
Detection Rate : 0.4936
Detection Prevalence : 0.6501
Balanced Accuracy : 0.7087
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Dental Insurance`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Dental Insurance`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Dental Insurance` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 26.14%
Confusion matrix:
0 Yes class.error
0 387 251 0.3934169
Yes 172 808 0.1755102
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 567 19
Yes 71 961
Accuracy : 0.9444
95% CI : (0.9321, 0.955)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8819
Mcnemar's Test P-Value : 7.621e-08
Sensitivity : 0.9806
Specificity : 0.8887
Pos Pred Value : 0.9312
Neg Pred Value : 0.9676
Prevalence : 0.6057
Detection Rate : 0.5939
Detection Prevalence : 0.6378
Balanced Accuracy : 0.9347
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 168 76
Yes 109 350
Accuracy : 0.7368
95% CI : (0.7026, 0.7691)
No Information Rate : 0.606
P-Value [Acc > NIR] : 2.077e-13
Kappa : 0.4372
Mcnemar's Test P-Value : 0.01864
Sensitivity : 0.8216
Specificity : 0.6065
Pos Pred Value : 0.7625
Neg Pred Value : 0.6885
Prevalence : 0.6060
Detection Rate : 0.4979
Detection Prevalence : 0.6529
Balanced Accuracy : 0.7140
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Dental Insurance`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.7998
pROC::auc(rocobj_wo)Area under the curve: 0.7967
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 5.994484 17.831023 19.295433 63.13087
Age 7.250900 31.263847 31.737507 139.82902
Gender -6.854679 12.278566 5.521210 22.30855
Religion -2.362729 13.833300 9.637888 61.08582
Employment 13.831350 31.792709 37.500551 38.80522
Income 38.744005 46.182120 57.569929 159.48297
EnglishSpeak 21.815707 19.476879 32.219878 62.49798
EnglishDiff 3.099861 6.366257 7.458982 48.81062
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Dental Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 395 184
Yes 243 796
Accuracy : 0.7361
95% CI : (0.7139, 0.7574)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4384
Mcnemar's Test P-Value : 0.005003
Sensitivity : 0.8122
Specificity : 0.6191
Pos Pred Value : 0.7661
Neg Pred Value : 0.6822
Prevalence : 0.6057
Detection Rate : 0.4920
Detection Prevalence : 0.6422
Balanced Accuracy : 0.7157
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 181 76
Yes 96 350
Accuracy : 0.7553
95% CI : (0.7218, 0.7867)
No Information Rate : 0.606
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4811
Mcnemar's Test P-Value : 0.1474
Sensitivity : 0.8216
Specificity : 0.6534
Pos Pred Value : 0.7848
Neg Pred Value : 0.7043
Prevalence : 0.6060
Detection Rate : 0.4979
Detection Prevalence : 0.6344
Balanced Accuracy : 0.7375
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Dental Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.8144
With ethnicity
Training Set
mod1 <- glm(`Dental Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 398 175
Yes 240 805
Accuracy : 0.7435
95% CI : (0.7215, 0.7646)
No Information Rate : 0.6057
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.4533
Mcnemar's Test P-Value : 0.00168
Sensitivity : 0.8214
Specificity : 0.6238
Pos Pred Value : 0.7703
Neg Pred Value : 0.6946
Prevalence : 0.6057
Detection Rate : 0.4975
Detection Prevalence : 0.6459
Balanced Accuracy : 0.7226
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 184 74
Yes 93 352
Accuracy : 0.7624
95% CI : (0.7292, 0.7935)
No Information Rate : 0.606
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4965
Mcnemar's Test P-Value : 0.1637
Sensitivity : 0.8263
Specificity : 0.6643
Pos Pred Value : 0.7910
Neg Pred Value : 0.7132
Prevalence : 0.6060
Detection Rate : 0.5007
Detection Prevalence : 0.6330
Balanced Accuracy : 0.7453
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Dental Insurance`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.815
pROC::auc(rocobj_wo)Area under the curve: 0.8144
Logistic Regression (ANOVA)
glm(`Dental Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Dental Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Dental Insurance` ~ Age + Ethnicity + Gender +
Religion + Income + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.036310 0.382169 -2.712 0.006695 **
Age 0.001404 0.003302 0.425 0.670765
EthnicityAsian Indian 0.287705 0.332272 0.866 0.386561
EthnicityFilipino 0.388713 0.243110 1.599 0.109839
EthnicityKorean -0.718523 0.169016 -4.251 2.13e-05 ***
EthnicityOther -0.014080 0.253570 -0.056 0.955719
EthnicityVietnamese -0.043980 0.178087 -0.247 0.804940
GenderMale -0.404963 0.106236 -3.812 0.000138 ***
ReligionCatholic 0.063762 0.191971 0.332 0.739781
ReligionHindu -0.867447 0.352066 -2.464 0.013744 *
ReligionMuslim -0.991203 0.397838 -2.491 0.012721 *
ReligionNone -0.177425 0.193299 -0.918 0.358682
ReligionOther -0.466679 0.409932 -1.138 0.254941
ReligionProtestant -0.225351 0.193424 -1.165 0.243993
Income$10,000 - $19,999 -0.593746 0.226870 -2.617 0.008868 **
Income$20,000 - $29,999 -0.595717 0.222689 -2.675 0.007470 **
Income$30,000 - $39,999 -0.067002 0.216460 -0.310 0.756915
Income$40,000 - $49,999 0.120388 0.226426 0.532 0.594942
Income$50,000 - $59,999 0.434508 0.225649 1.926 0.054156 .
Income$60,000 - $69,999 0.819607 0.225122 3.641 0.000272 ***
Income$70,000 and over 1.572526 0.177728 8.848 < 2e-16 ***
EmploymentEmployed full time 0.971032 0.111288 8.725 < 2e-16 ***
EnglishSpeakNot well 0.831868 0.271836 3.060 0.002212 **
EnglishSpeakVery well 1.560834 0.286206 5.454 4.94e-08 ***
EnglishSpeakWell 1.048865 0.272039 3.856 0.000115 ***
EnglishDiffMuch -0.364049 0.168306 -2.163 0.030540 *
EnglishDiffNot much -0.152249 0.158931 -0.958 0.338085
EnglishDiffVery much -0.366967 0.157262 -2.333 0.019623 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3112.9 on 2320 degrees of freedom
Residual deviance: 2405.0 on 2293 degrees of freedom
AIC: 2461
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Dental Insurance
LR Chisq Df Pr(>Chisq)
Age 0.181 1 0.6707549
Ethnicity 32.224 5 5.364e-06 ***
Gender 14.717 1 0.0001249 ***
Religion 11.138 6 0.0842022 .
Income 253.343 7 < 2.2e-16 ***
Employment 77.918 1 < 2.2e-16 ***
EnglishSpeak 36.779 3 5.125e-08 ***
EnglishDiff 7.928 3 0.0475285 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.216428 1 1.102918
Ethnicity 16.165895 5 1.320870
Gender 1.117911 1 1.057313
Religion 14.529225 6 1.249837
Income 1.399740 7 1.024311
Employment 1.194814 1 1.093075
EnglishSpeak 2.216819 3 1.141883
EnglishDiff 1.814683 3 1.104418
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Dental Insurance` ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: `Dental Insurance` ~ Age + Ethnicity + Gender + Religion + Income +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2298 2437.2
2 2293 2404.9 5 32.224 5.364e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2483.177 2460.953
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2615.421 2621.946
Urgent Care utilization in the past 12 months
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Urgentcare`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Urgentcare`=="Yes")
neg <- rfdata |> filter(`Urgentcare`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Urgentcare`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = Urgentcare ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 17.66%
Confusion matrix:
0 Yes class.error
0 1306 14 0.01060606
Yes 268 9 0.96750903
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 174
Yes 0 103
Accuracy : 0.891
95% CI : (0.8747, 0.9059)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 3.815e-13
Kappa : 0.4946
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.3718
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 0.8835
Prevalence : 0.1735
Detection Rate : 0.0645
Detection Prevalence : 0.0645
Balanced Accuracy : 0.6859
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 578 117
Yes 3 2
Accuracy : 0.8286
95% CI : (0.7986, 0.8558)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.5642
Kappa : 0.0188
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.016807
Specificity : 0.994836
Pos Pred Value : 0.400000
Neg Pred Value : 0.831655
Prevalence : 0.170000
Detection Rate : 0.002857
Detection Prevalence : 0.007143
Balanced Accuracy : 0.505822
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Urgentcare`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Urgentcare`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = Urgentcare ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 17.66%
Confusion matrix:
0 Yes class.error
0 1309 11 0.008333333
Yes 271 6 0.978339350
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 146
Yes 0 131
Accuracy : 0.9086
95% CI : (0.8934, 0.9223)
No Information Rate : 0.8265
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5973
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.47292
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.90041
Prevalence : 0.17345
Detection Rate : 0.08203
Detection Prevalence : 0.08203
Balanced Accuracy : 0.73646
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Urgentcare`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 577 117
Yes 4 2
Accuracy : 0.8271
95% CI : (0.7971, 0.8544)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.6033
Kappa : 0.0159
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.016807
Specificity : 0.993115
Pos Pred Value : 0.333333
Neg Pred Value : 0.831412
Prevalence : 0.170000
Detection Rate : 0.002857
Detection Prevalence : 0.008571
Balanced Accuracy : 0.504961
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Urgentcare`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls > cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.4858
pROC::auc(rocobj_wo)Area under the curve: 0.5102
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 10.686415 -6.1763833 8.5003474 35.33908
Age 4.496937 7.7223094 7.6703428 100.89055
Gender -0.119335 2.2184180 0.8534296 14.10049
Religion 12.831846 -2.7225625 12.0576909 36.31904
Employment 5.839689 -3.5613963 4.4406385 12.21524
Income 3.736587 2.4811992 4.5254163 53.60340
EnglishSpeak 11.614478 0.9775154 11.5592823 25.38791
EnglishDiff 9.580822 -2.3648761 8.3388440 30.57526
Urgent care IS NOT really predicted well. Might as well just say “yes” to everything since accuracy is not significantly different from NIR.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Urgentcare`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, train$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 277
Yes 0 0
Accuracy : 0.8265
95% CI : (0.8071, 0.8448)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 0.516
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0000
Specificity : 1.0000
Pos Pred Value : NaN
Neg Pred Value : 0.8265
Prevalence : 0.1735
Detection Rate : 0.0000
Detection Prevalence : 0.0000
Balanced Accuracy : 0.5000
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, test$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 581 119
Yes 0 0
Accuracy : 0.83
95% CI : (0.8001, 0.8571)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.5245
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.00
Specificity : 1.00
Pos Pred Value : NaN
Neg Pred Value : 0.83
Prevalence : 0.17
Detection Rate : 0.00
Detection Prevalence : 0.00
Balanced Accuracy : 0.50
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Urgentcare`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.5547
With ethnicity
Training Set
mod1 <- glm(`Urgentcare`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, train$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1320 277
Yes 0 0
Accuracy : 0.8265
95% CI : (0.8071, 0.8448)
No Information Rate : 0.8265
P-Value [Acc > NIR] : 0.516
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0000
Specificity : 1.0000
Pos Pred Value : NaN
Neg Pred Value : 0.8265
Prevalence : 0.1735
Detection Rate : 0.0000
Detection Prevalence : 0.0000
Balanced Accuracy : 0.5000
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, test$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 581 119
Yes 0 0
Accuracy : 0.83
95% CI : (0.8001, 0.8571)
No Information Rate : 0.83
P-Value [Acc > NIR] : 0.5245
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.00
Specificity : 1.00
Pos Pred Value : NaN
Neg Pred Value : 0.83
Prevalence : 0.17
Detection Rate : 0.00
Detection Prevalence : 0.00
Balanced Accuracy : 0.50
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Urgentcare`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.5521
Logistic Regression (ANOVA)
glm(`Urgentcare`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Urgentcare`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = Urgentcare ~ Age + Ethnicity + Gender + Religion +
Income + Employment + EnglishSpeak + EnglishDiff, family = binomial,
data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.825864 0.461983 -6.117 9.55e-10 ***
Age 0.010671 0.003744 2.850 0.00437 **
EthnicityAsian Indian 0.074110 0.353655 0.210 0.83402
EthnicityFilipino 0.285745 0.250026 1.143 0.25310
EthnicityKorean 0.342753 0.201449 1.701 0.08886 .
EthnicityOther 0.288352 0.284633 1.013 0.31103
EthnicityVietnamese 0.433903 0.207332 2.093 0.03637 *
GenderMale 0.071345 0.117536 0.607 0.54385
ReligionCatholic 0.212812 0.204803 1.039 0.29875
ReligionHindu 0.114642 0.370370 0.310 0.75692
ReligionMuslim -0.414343 0.488870 -0.848 0.39669
ReligionNone -0.225688 0.227351 -0.993 0.32086
ReligionOther -0.084914 0.453352 -0.187 0.85142
ReligionProtestant -0.061254 0.219467 -0.279 0.78017
Income$10,000 - $19,999 0.168237 0.278113 0.605 0.54523
Income$20,000 - $29,999 0.327745 0.271081 1.209 0.22665
Income$30,000 - $39,999 -0.067060 0.283435 -0.237 0.81297
Income$40,000 - $49,999 0.256103 0.281591 0.909 0.36309
Income$50,000 - $59,999 0.141910 0.287804 0.493 0.62196
Income$60,000 - $69,999 0.287057 0.276183 1.039 0.29863
Income$70,000 and over 0.144532 0.223021 0.648 0.51694
EmploymentEmployed full time 0.024963 0.124338 0.201 0.84088
EnglishSpeakNot well 0.121938 0.333446 0.366 0.71460
EnglishSpeakVery well 0.591279 0.345074 1.713 0.08662 .
EnglishSpeakWell 0.431813 0.331860 1.301 0.19319
EnglishDiffMuch 0.064030 0.183437 0.349 0.72705
EnglishDiffNot much 0.047657 0.171562 0.278 0.78118
EnglishDiffVery much -0.284500 0.173443 -1.640 0.10094
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2111.7 on 2296 degrees of freedom
Residual deviance: 2064.6 on 2269 degrees of freedom
AIC: 2120.6
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Urgentcare
LR Chisq Df Pr(>Chisq)
Age 8.0508 1 0.004548 **
Ethnicity 5.5492 5 0.352578
Gender 0.3682 1 0.543991
Religion 5.8991 6 0.434584
Income 3.4588 7 0.839573
Employment 0.0403 1 0.840883
EnglishSpeak 6.6399 3 0.084306 .
EnglishDiff 4.1245 3 0.248332
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.198076 1 1.094567
Ethnicity 14.821139 5 1.309448
Gender 1.105467 1 1.051412
Religion 13.102885 6 1.239122
Income 1.456945 7 1.027246
Employment 1.240089 1 1.113593
EnglishSpeak 2.319308 3 1.150517
EnglishDiff 1.800663 3 1.102991
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: Urgentcare ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: Urgentcare ~ Age + Ethnicity + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2274 2070.2
2 2269 2064.6 5 5.5492 0.3526
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2116.189 2120.64
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2248.195 2281.342
Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Physical Check-up`=="Yes")
neg <- rfdata |> filter(`Physical Check-up`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Physical Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Physical Check-up` ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 30.7%
Confusion matrix:
0 Yes class.error
0 154 365 0.7032755
Yes 129 961 0.1183486
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 385 5
Yes 134 1085
Accuracy : 0.9136
95% CI : (0.8988, 0.9269)
No Information Rate : 0.6774
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7886
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9954
Specificity : 0.7418
Pos Pred Value : 0.8901
Neg Pred Value : 0.9872
Prevalence : 0.6774
Detection Rate : 0.6743
Detection Prevalence : 0.7576
Balanced Accuracy : 0.8686
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 60 67
Yes 170 406
Accuracy : 0.6629
95% CI : (0.6266, 0.6978)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.7277
Kappa : 0.1347
Mcnemar's Test P-Value : 3.458e-11
Sensitivity : 0.8584
Specificity : 0.2609
Pos Pred Value : 0.7049
Neg Pred Value : 0.4724
Prevalence : 0.6728
Detection Rate : 0.5775
Detection Prevalence : 0.8193
Balanced Accuracy : 0.5596
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Physical Check-up`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Physical Check-up`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Physical Check-up` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 30.39%
Confusion matrix:
0 Yes class.error
0 163 356 0.6859345
Yes 133 957 0.1220183
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 409 4
Yes 110 1086
Accuracy : 0.9291
95% CI : (0.9155, 0.9412)
No Information Rate : 0.6774
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8287
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9963
Specificity : 0.7881
Pos Pred Value : 0.9080
Neg Pred Value : 0.9903
Prevalence : 0.6774
Detection Rate : 0.6750
Detection Prevalence : 0.7433
Balanced Accuracy : 0.8922
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 61 65
Yes 169 408
Accuracy : 0.6671
95% CI : (0.6309, 0.7019)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.6427
Kappa : 0.1446
Mcnemar's Test P-Value : 1.658e-11
Sensitivity : 0.8626
Specificity : 0.2652
Pos Pred Value : 0.7071
Neg Pred Value : 0.4841
Prevalence : 0.6728
Detection Rate : 0.5804
Detection Prevalence : 0.8208
Balanced Accuracy : 0.5639
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Physical Check-up`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6691
pROC::auc(rocobj_wo)Area under the curve: 0.647
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity -2.1405730 12.25127 10.297182 58.32849
Age 5.5814758 28.76194 27.503520 148.18282
Gender -1.4156418 12.00199 8.760002 23.49004
Religion -7.6574489 14.90969 8.434901 63.27393
Employment -8.4761204 19.80178 15.557200 20.80459
Income 1.7335994 26.88213 23.456748 96.62966
EnglishSpeak -3.6512705 17.67902 15.079959 43.68058
EnglishDiff 0.9252643 12.65443 11.215809 49.64512
No change between accuracy and no information rate on the test set.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Physical Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 183 110
Yes 336 980
Accuracy : 0.7228
95% CI : (0.7002, 0.7446)
No Information Rate : 0.6774
P-Value [Acc > NIR] : 4.52e-05
Kappa : 0.2841
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.8991
Specificity : 0.3526
Pos Pred Value : 0.7447
Neg Pred Value : 0.6246
Prevalence : 0.6774
Detection Rate : 0.6091
Detection Prevalence : 0.8179
Balanced Accuracy : 0.6258
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 60 65
Yes 170 408
Accuracy : 0.6657
95% CI : (0.6295, 0.7005)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.6721
Kappa : 0.1398
Mcnemar's Test P-Value : 1.167e-11
Sensitivity : 0.8626
Specificity : 0.2609
Pos Pred Value : 0.7059
Neg Pred Value : 0.4800
Prevalence : 0.6728
Detection Rate : 0.5804
Detection Prevalence : 0.8222
Balanced Accuracy : 0.5617
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Physical Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6736
With ethnicity
Training Set
mod1 <- glm(`Physical Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 191 116
Yes 328 974
Accuracy : 0.7241
95% CI : (0.7015, 0.7458)
No Information Rate : 0.6774
P-Value [Acc > NIR] : 2.852e-05
Kappa : 0.2929
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.8936
Specificity : 0.3680
Pos Pred Value : 0.7481
Neg Pred Value : 0.6221
Prevalence : 0.6774
Detection Rate : 0.6053
Detection Prevalence : 0.8092
Balanced Accuracy : 0.6308
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 72 64
Yes 158 409
Accuracy : 0.6842
95% CI : (0.6484, 0.7184)
No Information Rate : 0.6728
P-Value [Acc > NIR] : 0.2743
Kappa : 0.1986
Mcnemar's Test P-Value : 4.327e-10
Sensitivity : 0.8647
Specificity : 0.3130
Pos Pred Value : 0.7213
Neg Pred Value : 0.5294
Prevalence : 0.6728
Detection Rate : 0.5818
Detection Prevalence : 0.8065
Balanced Accuracy : 0.5889
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Physical Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.6884
pROC::auc(rocobj_wo)Area under the curve: 0.6736
Logistic Regression (ANOVA)
glm(`Physical Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Physical Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Physical Check-up` ~ Age + Ethnicity + Gender +
Religion + Income + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.670919 0.360385 -4.636 3.54e-06 ***
Age 0.036470 0.003463 10.531 < 2e-16 ***
EthnicityAsian Indian -0.429756 0.284941 -1.508 0.131496
EthnicityFilipino -0.070381 0.232562 -0.303 0.762171
EthnicityKorean -0.696159 0.163307 -4.263 2.02e-05 ***
EthnicityOther -0.497847 0.238551 -2.087 0.036892 *
EthnicityVietnamese 0.310191 0.179550 1.728 0.084060 .
GenderMale -0.691064 0.102474 -6.744 1.54e-11 ***
ReligionCatholic 0.277402 0.194246 1.428 0.153264
ReligionHindu 0.313433 0.306855 1.021 0.307049
ReligionMuslim 0.380113 0.369009 1.030 0.302968
ReligionNone 0.196886 0.189899 1.037 0.299833
ReligionOther -0.108269 0.371725 -0.291 0.770851
ReligionProtestant 0.072264 0.191630 0.377 0.706099
Income$10,000 - $19,999 -0.140279 0.213287 -0.658 0.510730
Income$20,000 - $29,999 -0.059952 0.216651 -0.277 0.781993
Income$30,000 - $39,999 0.223262 0.216758 1.030 0.303008
Income$40,000 - $49,999 0.022214 0.228213 0.097 0.922458
Income$50,000 - $59,999 0.377589 0.227892 1.657 0.097545 .
Income$60,000 - $69,999 0.184929 0.221144 0.836 0.403021
Income$70,000 and over 0.917047 0.175370 5.229 1.70e-07 ***
EmploymentEmployed full time 0.190892 0.109115 1.749 0.080211 .
EnglishSpeakNot well 0.880715 0.234409 3.757 0.000172 ***
EnglishSpeakVery well 1.395289 0.254386 5.485 4.14e-08 ***
EnglishSpeakWell 1.205357 0.238229 5.060 4.20e-07 ***
EnglishDiffMuch -0.264243 0.165476 -1.597 0.110296
EnglishDiffNot much -0.558953 0.153838 -3.633 0.000280 ***
EnglishDiffVery much -0.442620 0.147776 -2.995 0.002743 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2912.3 on 2311 degrees of freedom
Residual deviance: 2573.3 on 2284 degrees of freedom
AIC: 2629.3
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Physical Check-up
LR Chisq Df Pr(>Chisq)
Age 123.064 1 < 2.2e-16 ***
Ethnicity 38.328 5 3.242e-07 ***
Gender 46.509 1 9.122e-12 ***
Religion 4.232 6 0.6452799
Income 65.406 7 1.246e-11 ***
Employment 3.063 1 0.0800877 .
EnglishSpeak 32.444 3 4.220e-07 ***
EnglishDiff 17.111 3 0.0006705 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.328999 1 1.152822
Ethnicity 14.401679 5 1.305694
Gender 1.141211 1 1.068275
Religion 12.567268 6 1.234819
Income 1.493873 7 1.029084
Employment 1.280847 1 1.131745
EnglishSpeak 2.581069 3 1.171206
EnglishDiff 1.846083 3 1.107580
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Physical Check-up` ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: `Physical Check-up` ~ Age + Ethnicity + Gender + Religion + Income +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2289 2611.6
2 2284 2573.3 5 38.328 3.242e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2657.606 2629.278
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2789.761 2790.162
Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Dentist Check-up`=="Yes")
neg <- rfdata |> filter(`Dentist Check-up`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Dentist Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = `Dentist Check-up` ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 32.75%
Confusion matrix:
0 Yes class.error
0 352 311 0.4690799
Yes 214 726 0.2276596
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 567 44
Yes 96 896
Accuracy : 0.9127
95% CI : (0.8978, 0.926)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8178
Mcnemar's Test P-Value : 1.63e-05
Sensitivity : 0.9532
Specificity : 0.8552
Pos Pred Value : 0.9032
Neg Pred Value : 0.9280
Prevalence : 0.5864
Detection Rate : 0.5590
Detection Prevalence : 0.6188
Balanced Accuracy : 0.9042
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 145 99
Yes 149 309
Accuracy : 0.6467
95% CI : (0.6101, 0.6821)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.0002248
Kappa : 0.2566
Mcnemar's Test P-Value : 0.0018614
Sensitivity : 0.7574
Specificity : 0.4932
Pos Pred Value : 0.6747
Neg Pred Value : 0.5943
Prevalence : 0.5812
Detection Rate : 0.4402
Detection Prevalence : 0.6524
Balanced Accuracy : 0.6253
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Dentist Check-up`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Dentist Check-up`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = `Dentist Check-up` ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 32.81%
Confusion matrix:
0 Yes class.error
0 354 309 0.4660633
Yes 217 723 0.2308511
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 593 45
Yes 70 895
Accuracy : 0.9283
95% CI : (0.9145, 0.9404)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.8513
Mcnemar's Test P-Value : 0.02522
Sensitivity : 0.9521
Specificity : 0.8944
Pos Pred Value : 0.9275
Neg Pred Value : 0.9295
Prevalence : 0.5864
Detection Rate : 0.5583
Detection Prevalence : 0.6020
Balanced Accuracy : 0.9233
'Positive' Class : Yes
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 140 93
Yes 154 315
Accuracy : 0.6481
95% CI : (0.6115, 0.6835)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.0001672
Kappa : 0.2557
Mcnemar's Test P-Value : 0.0001347
Sensitivity : 0.7721
Specificity : 0.4762
Pos Pred Value : 0.6716
Neg Pred Value : 0.6009
Prevalence : 0.5812
Detection Rate : 0.4487
Detection Prevalence : 0.6681
Balanced Accuracy : 0.6241
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Dentist Check-up`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6809
pROC::auc(rocobj_wo)Area under the curve: 0.6656
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 2.637328 19.611727 23.513297 65.33439
Age 10.744222 22.345648 27.273211 150.85344
Gender 9.136014 2.322054 8.988817 25.43963
Religion -1.532119 23.866492 23.414587 73.33316
Employment 6.797482 8.650243 12.627362 22.84096
Income 17.154001 27.930689 33.181691 118.57301
EnglishSpeak 11.087684 20.980028 26.048237 57.89318
EnglishDiff 8.755849 7.419699 12.420752 56.88743
Accuracy slightly decreased when ethnicity is added to the model, but AUC increased.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Dentist Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 363 179
Yes 300 761
Accuracy : 0.7012
95% CI : (0.6781, 0.7235)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.367
Mcnemar's Test P-Value : 4.183e-08
Sensitivity : 0.8096
Specificity : 0.5475
Pos Pred Value : 0.7172
Neg Pred Value : 0.6697
Prevalence : 0.5864
Detection Rate : 0.4747
Detection Prevalence : 0.6619
Balanced Accuracy : 0.6785
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 141 93
Yes 153 315
Accuracy : 0.6496
95% CI : (0.613, 0.6849)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 0.0001236
Kappa : 0.259
Mcnemar's Test P-Value : 0.0001688
Sensitivity : 0.7721
Specificity : 0.4796
Pos Pred Value : 0.6731
Neg Pred Value : 0.6026
Prevalence : 0.5812
Detection Rate : 0.4487
Detection Prevalence : 0.6667
Balanced Accuracy : 0.6258
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Dentist Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6846
With ethnicity
Training Set
mod1 <- glm(`Dentist Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 370 191
Yes 293 749
Accuracy : 0.6981
95% CI : (0.6749, 0.7205)
No Information Rate : 0.5864
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3631
Mcnemar's Test P-Value : 4.413e-06
Sensitivity : 0.7968
Specificity : 0.5581
Pos Pred Value : 0.7188
Neg Pred Value : 0.6595
Prevalence : 0.5864
Detection Rate : 0.4672
Detection Prevalence : 0.6500
Balanced Accuracy : 0.6774
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 146 94
Yes 148 314
Accuracy : 0.6553
95% CI : (0.6188, 0.6904)
No Information Rate : 0.5812
P-Value [Acc > NIR] : 3.479e-05
Kappa : 0.2732
Mcnemar's Test P-Value : 0.0006569
Sensitivity : 0.7696
Specificity : 0.4966
Pos Pred Value : 0.6797
Neg Pred Value : 0.6083
Prevalence : 0.5812
Detection Rate : 0.4473
Detection Prevalence : 0.6581
Balanced Accuracy : 0.6331
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Dentist Check-up`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
pROC::auc(rocobj_w)Area under the curve: 0.6887
Logistic Regression (ANOVA)
glm(`Dentist Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Dentist Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = `Dentist Check-up` ~ Age + Ethnicity + Gender +
Religion + Income + Employment + EnglishSpeak + EnglishDiff,
family = binomial, data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.250239 0.353450 -3.537 0.000404 ***
Age 0.022191 0.003169 7.003 2.50e-12 ***
EthnicityAsian Indian -0.751432 0.279241 -2.691 0.007124 **
EthnicityFilipino -0.295595 0.220973 -1.338 0.180995
EthnicityKorean -0.703159 0.160825 -4.372 1.23e-05 ***
EthnicityOther -0.425536 0.238447 -1.785 0.074324 .
EthnicityVietnamese -0.254040 0.171791 -1.479 0.139202
GenderMale -0.571692 0.098374 -5.811 6.19e-09 ***
ReligionCatholic -0.086887 0.183688 -0.473 0.636202
ReligionHindu -0.843103 0.298541 -2.824 0.004742 **
ReligionMuslim -0.481085 0.361333 -1.331 0.183051
ReligionNone -0.226916 0.185576 -1.223 0.221419
ReligionOther -0.728813 0.372626 -1.956 0.050479 .
ReligionProtestant -0.310774 0.185933 -1.671 0.094636 .
Income$10,000 - $19,999 -0.523858 0.211866 -2.473 0.013414 *
Income$20,000 - $29,999 -0.430694 0.212835 -2.024 0.043011 *
Income$30,000 - $39,999 0.056685 0.208564 0.272 0.785786
Income$40,000 - $49,999 -0.149626 0.221115 -0.677 0.498603
Income$50,000 - $59,999 0.503876 0.222652 2.263 0.023632 *
Income$60,000 - $69,999 0.089518 0.216328 0.414 0.679015
Income$70,000 and over 0.969868 0.170368 5.693 1.25e-08 ***
EmploymentEmployed full time 0.288165 0.104906 2.747 0.006017 **
EnglishSpeakNot well 1.012791 0.241469 4.194 2.74e-05 ***
EnglishSpeakVery well 1.796948 0.259731 6.919 4.56e-12 ***
EnglishSpeakWell 1.268193 0.243930 5.199 2.00e-07 ***
EnglishDiffMuch -0.052360 0.155977 -0.336 0.737106
EnglishDiffNot much -0.151913 0.145964 -1.041 0.297986
EnglishDiffVery much -0.394036 0.140966 -2.795 0.005186 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3128.8 on 2304 degrees of freedom
Residual deviance: 2725.0 on 2277 degrees of freedom
AIC: 2781
Number of Fisher Scoring iterations: 4
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Dentist Check-up
LR Chisq Df Pr(>Chisq)
Age 50.555 1 1.159e-12 ***
Ethnicity 23.080 5 0.0003259 ***
Gender 34.266 1 4.807e-09 ***
Religion 11.443 6 0.0756010 .
Income 124.620 7 < 2.2e-16 ***
Employment 7.562 1 0.0059627 **
EnglishSpeak 54.819 3 7.505e-12 ***
EnglishDiff 8.846 3 0.0314106 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.279353 1 1.131085
Ethnicity 14.729289 5 1.308634
Gender 1.130464 1 1.063233
Religion 12.809561 6 1.236786
Income 1.500217 7 1.029396
Employment 1.276896 1 1.129998
EnglishSpeak 2.483570 3 1.163714
EnglishDiff 1.811780 3 1.104123
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: `Dentist Check-up` ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: `Dentist Check-up` ~ Age + Ethnicity + Gender + Religion + Income +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2282 2748.0
2 2277 2724.9 5 23.08 0.0003259 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 2794.034 2780.954
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 2926.119 2941.754
Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest
Without Ethnicity
Training Set
rfdata <- qol |>
select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`)
pos<- rfdata |> filter(`Folkmedicine`=="Yes")
neg <- rfdata |> filter(`Folkmedicine`==0)
set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))
train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])
randomForest::randomForest(`Folkmedicine`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
data=train,
importance=TRUE) -> rf_wo
print(rf_wo)
Call:
randomForest(formula = Folkmedicine ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 14.05%
Confusion matrix:
0 Yes class.error
0 1361 11 0.008017493
Yes 212 3 0.986046512
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1372 128
Yes 0 87
Accuracy : 0.9193
95% CI : (0.9048, 0.9323)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 6.136e-12
Kappa : 0.5403
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.40465
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.91467
Prevalence : 0.13548
Detection Rate : 0.05482
Detection Prevalence : 0.05482
Balanced Accuracy : 0.70233
'Positive' Class : Yes
Test set
pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 598 96
Yes 2 1
Accuracy : 0.8594
95% CI : (0.8314, 0.8844)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.5702
Kappa : 0.0117
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.010309
Specificity : 0.996667
Pos Pred Value : 0.333333
Neg Pred Value : 0.861671
Prevalence : 0.139168
Detection Rate : 0.001435
Detection Prevalence : 0.004304
Balanced Accuracy : 0.503488
'Positive' Class : Yes
ROC Curve
pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Folkmedicine`,pred_noeth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
With Ethnicity
Training Set
# rfdata <- qol |>
# select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
# na.omit() |>
# rename(Employment=`Full Time Employment`,
# EnglishSpeak=`English Speaking`,
# EnglishDiff=`English Difficulties`)
#
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
#
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]
randomForest::randomForest(`Folkmedicine`~. ,data=train,
importance=TRUE) -> rf_w
print(rf_w)
Call:
randomForest(formula = Folkmedicine ~ ., data = train, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 14.11%
Confusion matrix:
0 Yes class.error
0 1360 12 0.008746356
Yes 212 3 0.986046512
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1372 93
Yes 0 122
Accuracy : 0.9414
95% CI : (0.9287, 0.9524)
No Information Rate : 0.8645
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.694
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.56744
Specificity : 1.00000
Pos Pred Value : 1.00000
Neg Pred Value : 0.93652
Prevalence : 0.13548
Detection Rate : 0.07687
Detection Prevalence : 0.07687
Balanced Accuracy : 0.78372
'Positive' Class : Yes
Test Set
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 595 96
Yes 5 1
Accuracy : 0.8551
95% CI : (0.8267, 0.8804)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.6923
Kappa : 0.0033
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.010309
Specificity : 0.991667
Pos Pred Value : 0.166667
Neg Pred Value : 0.861071
Prevalence : 0.139168
Detection Rate : 0.001435
Detection Prevalence : 0.008608
Balanced Accuracy : 0.500988
'Positive' Class : Yes
ROC Curve
pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Folkmedicine`,pred_eth[,2])Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))
AUROC
pROC::auc(rocobj)Area under the curve: 0.6394
pROC::auc(rocobj_wo)Area under the curve: 0.6139
Variable Importance
randomForest::varImpPlot(rf_w)
randomForest::importance(rf_w) 0 Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity 12.726453 6.3822426 15.118640 32.25924
Age 12.115027 10.9895607 15.384077 81.72420
Gender 2.006013 3.8682783 3.404765 12.07102
Religion 3.071722 0.8760851 3.184835 30.79800
Employment 10.548735 1.1738049 10.962190 11.33764
Income 7.361344 -2.8518109 5.614595 46.51250
EnglishSpeak 9.355979 -1.1478322 8.759329 23.88430
EnglishDiff 4.538927 0.6254122 4.520140 26.85372
Accuracy slightly decreased when ethnicity is added to the model.
Logistic Regression
No ethnicity
Training Set
mod1 <- glm(`Folkmedicine`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")Warning in confusionMatrix.default(pred_noeth, train$Folkmedicine, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1372 215
Yes 0 0
Accuracy : 0.8645
95% CI : (0.8467, 0.881)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 0.5182
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0000
Specificity : 1.0000
Pos Pred Value : NaN
Neg Pred Value : 0.8645
Prevalence : 0.1355
Detection Rate : 0.0000
Detection Prevalence : 0.0000
Balanced Accuracy : 0.5000
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 598 96
Yes 2 1
Accuracy : 0.8594
95% CI : (0.8314, 0.8844)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.5702
Kappa : 0.0117
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.010309
Specificity : 0.996667
Pos Pred Value : 0.333333
Neg Pred Value : 0.861671
Prevalence : 0.139168
Detection Rate : 0.001435
Detection Prevalence : 0.004304
Balanced Accuracy : 0.503488
'Positive' Class : Yes
ROC
rocobj_wo <-pROC::roc(test$`Folkmedicine`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)
pROC::auc(rocobj_wo)Area under the curve: 0.6245
With ethnicity
Training Set
mod1 <- glm(`Folkmedicine`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial)
predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 1369 212
Yes 3 3
Accuracy : 0.8645
95% CI : (0.8467, 0.881)
No Information Rate : 0.8645
P-Value [Acc > NIR] : 0.5182
Kappa : 0.0199
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.013953
Specificity : 0.997813
Pos Pred Value : 0.500000
Neg Pred Value : 0.865908
Prevalence : 0.135476
Detection Rate : 0.001890
Detection Prevalence : 0.003781
Balanced Accuracy : 0.505883
'Positive' Class : Yes
Test Set
predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 596 97
Yes 4 0
Accuracy : 0.8551
95% CI : (0.8267, 0.8804)
No Information Rate : 0.8608
P-Value [Acc > NIR] : 0.6923
Kappa : -0.0111
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.000000
Specificity : 0.993333
Pos Pred Value : 0.000000
Neg Pred Value : 0.860029
Prevalence : 0.139168
Detection Rate : 0.000000
Detection Prevalence : 0.005739
Balanced Accuracy : 0.496667
'Positive' Class : Yes
ROC
rocobj_w <-pROC::roc(test$`Folkmedicine`,predicted_probs)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)
AUROC
pROC::auc(rocobj_w)Area under the curve: 0.6418
pROC::auc(rocobj_wo)Area under the curve: 0.6245
Logistic Regression (ANOVA)
glm(`Folkmedicine`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Folkmedicine`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1wANOVA analysis
summary(mod1w)
Call:
glm(formula = Folkmedicine ~ Age + Ethnicity + Gender + Religion +
Income + Employment + EnglishSpeak + EnglishDiff, family = binomial,
data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.43556 0.46953 -5.187 2.13e-07 ***
Age 0.02415 0.00416 5.805 6.43e-09 ***
EthnicityAsian Indian -0.11957 0.42985 -0.278 0.78088
EthnicityFilipino -0.58713 0.30650 -1.916 0.05541 .
EthnicityKorean 0.20188 0.18616 1.084 0.27816
EthnicityOther -0.43428 0.33056 -1.314 0.18893
EthnicityVietnamese -1.06441 0.24323 -4.376 1.21e-05 ***
GenderMale -0.24896 0.13467 -1.849 0.06449 .
ReligionCatholic -0.30153 0.26097 -1.155 0.24792
ReligionHindu -0.70599 0.46964 -1.503 0.13278
ReligionMuslim -2.75096 1.07403 -2.561 0.01043 *
ReligionNone -0.25484 0.23767 -1.072 0.28361
ReligionOther 0.20838 0.50280 0.414 0.67855
ReligionProtestant -0.17524 0.23207 -0.755 0.45018
Income$10,000 - $19,999 0.44302 0.31741 1.396 0.16280
Income$20,000 - $29,999 0.52783 0.33533 1.574 0.11548
Income$30,000 - $39,999 0.94768 0.31168 3.041 0.00236 **
Income$40,000 - $49,999 0.46466 0.34923 1.331 0.18334
Income$50,000 - $59,999 0.53736 0.33581 1.600 0.10955
Income$60,000 - $69,999 0.70177 0.33210 2.113 0.03459 *
Income$70,000 and over 0.61176 0.27203 2.249 0.02452 *
EmploymentEmployed full time -0.20032 0.14311 -1.400 0.16159
EnglishSpeakNot well -0.16989 0.27463 -0.619 0.53617
EnglishSpeakVery well -0.50869 0.30937 -1.644 0.10012
EnglishSpeakWell -0.13168 0.27963 -0.471 0.63770
EnglishDiffMuch -0.05387 0.21241 -0.254 0.79978
EnglishDiffNot much -0.12709 0.19831 -0.641 0.52162
EnglishDiffVery much -0.13559 0.20930 -0.648 0.51711
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1821.5 on 2283 degrees of freedom
Residual deviance: 1666.8 on 2256 degrees of freedom
AIC: 1722.8
Number of Fisher Scoring iterations: 6
car::Anova(mod1w)Analysis of Deviance Table (Type II tests)
Response: Folkmedicine
LR Chisq Df Pr(>Chisq)
Age 33.881 1 5.860e-09 ***
Ethnicity 30.371 5 1.247e-05 ***
Gender 3.446 1 0.06340 .
Religion 14.206 6 0.02742 *
Income 10.679 7 0.15327
Employment 1.967 1 0.16079
EnglishSpeak 4.608 3 0.20289
EnglishDiff 0.670 3 0.88013
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w) GVIF Df GVIF^(1/(2*Df))
Age 1.209700 1 1.099863
Ethnicity 13.559109 5 1.297846
Gender 1.091570 1 1.044782
Religion 11.613667 6 1.226726
Income 1.405444 7 1.024609
Employment 1.218399 1 1.103811
EnglishSpeak 2.479916 3 1.163428
EnglishDiff 1.825507 3 1.105513
anova(mod1,mod1w,test="LRT")Analysis of Deviance Table
Model 1: Folkmedicine ~ Age + Gender + Religion + Income + Employment +
EnglishSpeak + EnglishDiff
Model 2: Folkmedicine ~ Age + Ethnicity + Gender + Religion + Income +
Employment + EnglishSpeak + EnglishDiff
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 2261 1697.2
2 2256 1666.8 5 30.371 1.247e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC/BIC values
data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w)) AIC_wo AIC_w
1 1743.189 1722.818
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w)) BIC_wo BIC_w
1 1875.064 1883.361